# SET DIRECTORY TO SOURCE FILE
# (presumably: start R with the working directory set to this script's own
# folder, since the setwd() below is relative to wherever R currently is.)

# After this call, every relative path passed to read.csv()/write.csv() in this
# script ("raw/...", "Ancillary Data/...", "cleaned/...") is resolved against
# the data directory selected here.
setwd("../../data")

library(tidyverse)
require(xtable)

# Load the raw survey export and reduce it to the analysis sample:
#   * keep rows where apisuccess is TRUE and GC equals 1,
#   * drop the listed lookup columns plus the two filter columns, which are
#     constant after filtering,
#   * give three awkwardly named columns short names.
# read.csv() mangles non-syntactic headers (hence X2016vote and the long
# dotted income column name below).
dat <- read.csv("raw/heuristics_early2018_data_pii_removed.csv", stringsAsFactors = FALSE) %>%
  # Spell out TRUE: T is an ordinary variable that any code can reassign.
  filter(apisuccess == TRUE) %>%
  filter(GC == 1) %>%
  select(-longname, -shortname, -otherparty, -partyoneletter, -apisuccess, -GC) %>%
  rename(repcandidateid = candidateid,
         presvote2016 = X2016vote,
         income = `Information.about.income.is.very.important.to.understand...Would.you.please.give.your.best.guess.Please.indicate.the.answer.that.includes.your.entire.household.income.in..previous.year..before.taxes.`)

# Attach the actual roll-call record of each respondent's member of Congress.
# Every column of the MC file is renamed: any 'vote' in the name becomes
# 'actualvote' and the whole name gets a 'rep' prefix, so the file's
# candidateid column turns into the repcandidateid join key.
votes <- read.csv("Ancillary Data/processed_MCs_for_api.csv", stringsAsFactors = FALSE)
mc.colnames <- gsub('vote', 'actualvote', names(votes))
names(votes) <- paste0('rep', mc.colnames)

# merge() keeps only respondents with a matching MC row and puts the key
# column first; the positional indexing further down relies on that layout.
dat <- merge(x = dat, y = votes, by = "repcandidateid")

# House vote ids, in the order the corresponding survey columns appear.
vote.ids <- c("55735", "56612", "59180", "59438", "58622", "58745",
              "59189", "59434", "60053", "60590", "60787", "59737",
              "59632", "60153", "60587", "61788", "61746")

# Spot check of the positional mapping: columns 2-18 should be the
# respondent's own views and columns 21-37 the guesses about the MC's votes.
# Print one row per vote id so the pairing can be checked by eye.
spot.check <- data.frame(vote.ids, names(dat)[2:18], names(dat)[21:37])
for (row in seq_len(nrow(spot.check))) {
  print(spot.check[row, ])
}

# Recode both batteries. Previously 1 = for, 2 = against; afterwards
# 1 = for, 0 = against. This works by position, so it is independent of
# the renaming below.
dat[, 2:18] <- 2 - dat[, 2:18]
dat[, 21:37] <- 2 - dat[, 21:37]

# Rename the columns so that each carries its vote id.
names(dat)[2:18] <- paste0('ownview', vote.ids)      # own views
names(dat)[21:37] <- paste0('repvoteperc', vote.ids) # MOC vote guesses


# Data at the level of votes/SIGs potentially shown.
# The map file says which survey split ('a' or 'b') each House vote belongs to;
# collect the vote ids of each split as character vectors.
vote.group.map <- read.csv('Ancillary Data/vote_and_group_info_map.csv', stringsAsFactors = FALSE)
split.a.rows <- filter(vote.group.map, split == 'a')
split.b.rows <- filter(vote.group.map, split == 'b')
a.votes <- as.character(split.a.rows$house_vote_id)
b.votes <- as.character(split.b.rows$house_vote_id)
  

## Record, per vote, whether the respondent guessed the MC's vote correctly.
# The guesses (repvoteperc*) were recoded from 1/2 to 1/0 above so that they
# can be compared with the actual-vote columns, which the original note
# describes as 0/1 -- confirm against the MC file if in doubt.
for (vote in vote.ids) {
  guess.col <- paste0('repvoteperc', vote)
  truth.col <- paste0('repactualvote', vote)
  correct.col <- paste0('repperccorrect', vote)

  dat[, correct.col] <- as.numeric(dat[, guess.col] == dat[, truth.col])

  # If you didn't guess, you don't know.
  dat[is.na(dat[, guess.col]), correct.col] <- 0
  # If there's no vote, you can't know.
  dat[is.na(dat[, truth.col]), correct.col] <- NA

  # If you weren't asked (the vote was on the other survey version),
  # you can't have known.
  if (vote %in% a.votes) {
    dat[dat$survey_version == 'b', correct.col] <- NA
  }
  if (vote %in% b.votes) {
    dat[dat$survey_version == 'a', correct.col] <- NA
  }
}

# Issue publics codings
# For every vote, copy the matching survey item(s) into a column named after
# the vote id: issuepubsubj<id> for the subjective item and issuepubobj<id>
# for the objective item.
# a votes
# The bare string below is never assigned or used; it only serves as an inline
# reference table of vote id -> bill title for the "a" votes.
"55735	Sportsmen's Heritage and Recreational Enhancement (SHARE) Act of 2015
56612	Prohibits Use of Funds for Discrimination Based on Sexual Orientation or Gender Identity
59180	Working Families Flexibility Act of 2017
59438	Reducing Regulatory Burdens Act of 2017
59737	Department of Veterans Affairs Accountability and Whistleblower Protection Act of 2017
60053	Kate's Law
60590	Promoting Cross Border Energy Infrastructure Act
60787	Countering America's Adversaries Through Sanctions Act"
# Subjective items (issuepubsa... columns).
dat$issuepubsubj55735 <- dat$issuepubsa...Gun.rights...gun.control
dat$issuepubsubj56612 <- dat$issuepubsa...LGBT..Lesbian..Gay..Bisexual..and.Transgender..issues
dat$issuepubsubj59180 <- dat$issuepubsa...Overtime.pay
dat$issuepubsubj59438 <- dat$issuepubsa...Environmental.protection
dat$issuepubsubj59737 <- dat$issuepubsa...Veterans.issues
dat$issuepubsubj60053 <- dat$issuepubsa...Immigration
# Two items map to this vote; paste0() concatenates their responses into one
# string per respondent.
# NOTE(review): unlike the other issuepubsubj columns this one therefore ends
# up as character (an NA response becomes the text "NA") -- confirm that
# downstream code expects that.
dat$issuepubsubj60590 <- paste0(dat$issuepubsa...Energy.policy, dat$issuepubsa...Environmental.protection)
dat$issuepubsubj60787 <- dat$issuepubsa...Foreign.policy

# Objective items (issuepubobjectivea... columns). Where several items map to
# one vote they are concatenated with paste0(); the self/penumbra split later
# in this script only searches these columns for the digits 1 and 2, so a
# concatenated string works there.
dat$issuepubobj55735 <- dat$issuepubobjectivea...Owns.a.gun.
dat$issuepubobj56612 <- dat$issuepubobjectivea...Identifies.as.LGBT..Lesbian..Gay..Bisexual..or.Transgender..
dat$issuepubobj59180 <- dat$issuepubobjectivea...At.work..has.the.opportunity.to.work.overtime.for.additional.pay.
dat$issuepubobj59438 <- dat$issuepubobjectivea...Works.in.an.industry.that.uses.pesticides.
dat$issuepubobj59737 <- paste0(dat$issuepubobjectivea...Is.a.US.military.veteran., dat$issuepubobjectivea...Is.currently.serving.in.the.US.military.)
dat$issuepubobj60053 <- dat$issuepubobjectivea...Immigrated.to.the.United.States.in.the.last.5.years.
dat$issuepubobj60590 <- dat$issuepubobjectivea...Works.or.invests.in.the.oil.or.natural.gas.industry.
dat$issuepubobj60787 <- dat$issuepubobjectivea...Has.relatives.in.Iran..North.Korea..or.Russia.

# b votes
# Same mapping as for the "a" votes, using the issuepubsb... (subjective) and
# issuepubobjectiveb... (objective) survey columns.
# The bare string below is never assigned or used; it only serves as an inline
# reference table of vote id -> bill title for the "b" votes.
"58622	No Taxpayer Funding for Abortion and Abortion Insurance Full Disclosure Act of 2017
58745	Veterans 2nd Amendment Protection Act
59189	American Health Care Act of 2017
59434	Thin Blue Line Act
59632	Financial CHOICE Act of 2017
60153	No Sanctuary for Criminals Act
60587	Ozone Standards Implementation Act of 2017
61788	Tax Cuts and Jobs Act
61746	Save Local Businesses Act"
# Subjective items.
dat$issuepubsubj58622 <- dat$issuepubsb...Abortion
dat$issuepubsubj58745 <- dat$issuepubsb...Veterans
dat$issuepubsubj59189 <- dat$issuepubsb...Health.care
# Two items map to this vote; paste0() concatenates their responses.
# NOTE(review): this makes the column character (NA becomes the text "NA"),
# unlike the other issuepubsubj columns -- confirm downstream handling.
dat$issuepubsubj59434 <- paste0(dat$issuepubsb...Death.penalty, dat$issuepubsb...Police)
dat$issuepubsubj59632 <- dat$issuepubsb...Regulation.of.banks
dat$issuepubsubj60153 <- dat$issuepubsb...Immigration
dat$issuepubsubj60587 <- dat$issuepubsb...Environmental.protection
dat$issuepubsubj61788 <- dat$issuepubsb...Taxes
dat$issuepubsubj61746 <- dat$issuepubsb...Regulations.that.protect.workers

# Objective items. Votes with several relevant items get the responses
# concatenated with paste0(); the self/penumbra split later in this script
# only searches these columns for the digits 1 and 2.
dat$issuepubobj58622 <- dat$issuepubobjectiveb...Has.considered.having.an.abortion.
dat$issuepubobj58745 <- paste0(dat$issuepubobjectiveb...Owns.a.gun.,
                               dat$issuepubobjectiveb...Is.a.US.military.veteran.,
                               dat$issuepubobjectiveb...Is.currently.serving.in.the.US.military.)
dat$issuepubobj59189 <- dat$issuepubobjectiveb...Receives.government.assistance.to.pay.for.health.care.
dat$issuepubobj59434 <- dat$issuepubobjectiveb...Works.as.a.police.officer..firefighter..or.other.first.responder.
dat$issuepubobj59632 <- dat$issuepubobjectiveb...Works.in.the.financial.industry.
dat$issuepubobj60153 <- dat$issuepubobjectiveb...Immigrated.to.the.United.States.in.the.last.5.years.
dat$issuepubobj60587 <- dat$issuepubobjectiveb...Works.in.an.industry.that.emits.ozone.gases.into.the.atmosphere.
dat$issuepubobj61788 <- dat$issuepubobjectiveb...Makes.over..250.000.per.year.
dat$issuepubobj61746 <- paste0(dat$issuepubobjectiveb...Works.for.a..temp..agency.,
                               dat$issuepubobjectiveb...Owns.a.business.)

# Split the objective issue-public item into two logical flags per vote:
#   issuepubself<id>  - the (possibly concatenated) response contains a "1"
#   issuepubpenum<id> - the response contains a "2"
# (presumably 1 = applies to the respondent, 2 = applies to someone close to
# them, the "penumbra" -- confirm against the questionnaire.)
for (vote in vote.ids) {
  objective <- dat[, paste0('issuepubobj', vote)]
  dat[, paste0('issuepubself', vote)] <- grepl('1', objective)
  dat[, paste0('issuepubpenum', vote)] <- grepl('2', objective)
}

# Remove unneeded variables: the raw survey items and the intermediate
# issuepubobj<id> columns. The issuepubsubj/self/penum columns stay.
dat <- dat %>%
  select(-starts_with('issuepubobjective'),
         -starts_with('issuepubobj'),
         -starts_with('issuepubsb.'),
         -starts_with('issuepubsa.'))

# Knowledge battery
# Each item is scored 1 when the response equals the keyed answer code and 0
# otherwise (a missing response stays NA).
answer.key <- c(knowledge_controlhouse = 1,
                knowledge_roberts = 3,
                knowledge_kelly = 6,
                knowledge_doddfrank = 1)
for (item in names(answer.key)) {
  dat[[item]] <- ifelse(dat[[item]] == answer.key[[item]], 1, 0)
}

# spot check: inter-item correlations
dat %>% select(starts_with('knowledge_')) %>% cor()

# Additive scale: number of items answered correctly. The individual items
# are dropped afterwards.
dat$knowledgescale <- with(
  dat,
  knowledge_controlhouse + knowledge_roberts + knowledge_kelly + knowledge_doddfrank
)
dat <- dat %>% select(-starts_with('knowledge_'))

# PID
# Build a 7-point party ID scale running from 1 (pid_dembranch == 1) to
# 7 (pid_repbranch == 1) out of the branching items.
# Missing branch responses are coded 0 first so the comparisons below never
# see NA in those three columns.
dat$pid_dembranch <- replace(dat$pid_dembranch, is.na(dat$pid_dembranch), 0)
dat$pid_repbranch <- replace(dat$pid_repbranch, is.na(dat$pid_repbranch), 0)
dat$pid_closerbranch <- replace(dat$pid_closerbranch, is.na(dat$pid_closerbranch), 0)

# The rules are applied in this order; where a respondent matches more than
# one rule, the later assignment wins.
dat$pid <- NA
dat$pid[dat$pid_dembranch == 1] <- 1
dat$pid[dat$pid_dembranch == 2] <- 2
dat$pid[dat$pid_closerbranch == 2] <- 3
dat$pid[dat$pid_closerbranch == 3 | dat$pid_branch1 == 4] <- 4
dat$pid[dat$pid_closerbranch == 1] <- 5
dat$pid[dat$pid_repbranch == 2] <- 6
dat$pid[dat$pid_repbranch == 1] <- 7

# The raw branching items are no longer needed.
dat <- dat %>% select(-starts_with('pid_'))


# MC approval questions
# Reverse an item via 8 - x (so 1 <-> 7 on a 1-7 item), then replace missing
# responses with the mean of the observed (reversed) values.
reverse.and.impute <- function(x) {
  flipped <- 8 - x
  flipped[is.na(flipped)] <- mean(flipped, na.rm = TRUE)
  flipped
}
dat$mcapprove <- reverse.and.impute(dat$mcapprove)
dat$mcfavorability <- reverse.and.impute(dat$mcfavorability)

# Generic ballot on a -2..2 scale built from the main item (X2018generic) and
# the leaner follow-up (X2018genericlean); respondents matching no rule stay 0.
# The rules are applied in this order, later ones overwriting earlier ones.
generic.ballot <- rep(0, nrow(dat))
generic.ballot[dat$X2018generic == 1] <- 2
generic.ballot[dat$X2018genericlean == 1] <- 1
generic.ballot[dat$X2018genericlean == 3] <- 0
generic.ballot[dat$X2018genericlean == 2] <- -1
generic.ballot[dat$X2018generic == 2] <- -2
dat$genericballot <- generic.ballot

# Combine the three measures into one scale: scores on the single principal
# component extracted by psych::principal().
library(psych)
rating.items <- dat[, c('mcapprove', 'mcfavorability', 'genericballot')]
fit <- principal(rating.items, nfactors = 1, rotate = "varimax")
dat$mcratingscale <- fit$scores[, 1]



# Our control respondents were given a different experiment. The project
# identifier 'heuristics' distinguishes our pure control from our two
# endorsement-receiving groups, so ratingsshown is TRUE exactly for rows whose
# project is 'heuristics'.
dat$ratingsshown <- dat$project == 'heuristics'

# Write data for heuristics project: keep eligible respondents only and drop
# bookkeeping columns that are not needed in the cleaned file.
eligible.rows <- filter(dat, heuristics_eligible == TRUE)
dat <- select(eligible.rows,
              -starts_with('randomvote'), # randomvote codes the treatment shown in the other arm
              -heuristics_eligible,
              -num_eligible_votes)
write.csv(dat, 'cleaned/heuristics_cleaned_wide.csv')


# Next, write "long" data at vote by person level.
# This is used in Study 2.

# Each row of the map pairs a House vote with a SIG. Make missingness agree
# within every pair:
#   * If rep vote is missing, set SIG rating to NA since it would not be used.
#   * If SIG rating is missing, set rep vote to NA since it would not be used.
# The second step runs after the first, so it also sees the NAs just written.
votes <- read.csv('Ancillary Data/vote_and_group_info_map.csv', stringsAsFactors = FALSE)
# seq_len() instead of 1:nrow(): with an empty map, 1:0 would iterate over
# c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(votes))) {
  sig_id <- votes$sig_id[i]
  house_vote_id <- votes$house_vote_id[i]
  rating.col <- paste0('reprating', sig_id)
  vote.col <- paste0('repactualvote', house_vote_id)

  dat[is.na(dat[, vote.col]), rating.col] <- NA
  dat[is.na(dat[, rating.col]), vote.col] <- NA
}

# Return the last `n` characters of each string in `x` (vectorized over `x`).
# Used below to pull the trailing vote id out of a column name. Strings
# shorter than `n` are returned whole.
substrRight <- function(x, n) {
  len <- nchar(x)
  first <- len - n + 1
  substr(x, first, len)
}

# This first melt call produces a respondent * issue area data frame with the
# respondent's guess regarding their Congressmember's vote.
# dat$id is a running row number used as the respondent key in the long data.
# seq_len() rather than 1:nrow() so an empty data frame yields no ids.
dat$id <- seq_len(nrow(dat))
guesses <- select(dat, starts_with('repvoteperc'), id) %>%
  # id.vars spelled out; the original relied on partial matching of `id`.
  reshape2::melt(id.vars = "id")
guesses$variable <- as.character(guesses$variable)
# The vote id is the last 5 characters of the column name. substrRight() is
# already vectorized, so it is called once on the whole column instead of
# once per element through sapply().
guesses$issue <- substrRight(guesses$variable, 5)
guesses <- guesses %>%
  rename(guess = value) %>%
  select(-variable)

# This second melt call produces a respondent * issue area data frame
# with their Congressmember's actual vote.
rep.actual.votes <- select(dat, starts_with('repactualvote'), id) %>%
  # id.vars spelled out; the original relied on partial matching of `id`.
  reshape2::melt(id.vars = "id")
rep.actual.votes$variable <- as.character(rep.actual.votes$variable)
# The vote id is the last 5 characters of the column name. substrRight() is
# already vectorized, so it is called once on the whole column instead of
# once per element through sapply().
rep.actual.votes$issue <- substrRight(rep.actual.votes$variable, 5)
rep.actual.votes <- rep.actual.votes %>%
  rename(truth = value) %>%
  select(-variable)

# Respondent-level columns carried into the long data, including the vote for
# which the respondent received a SIG rating (randomrating_house_vote_id).
respondent.info <- select(dat, id, randomrating_house_vote_id, ratingsshown,
                          num_eligible_ratings, survey_version, knowledgescale)

dat_long <- guesses %>%
  # Join guesses and actual votes on their shared columns (id and issue),
  # then add the respondent-level columns by id.
  merge(rep.actual.votes) %>%
  merge(respondent.info) %>%
  # Dependent variable: whether the respondent guessed the MoC's vote
  # correctly on each issue.
  mutate(correct = guess == truth) %>%
  # NAs come from either MCs not voting or the issue being in the wrong group.
  # We expect just over 50% of NAs.
  filter(!is.na(correct)) %>%
  # Treatment-group indicators for each respondent * issue row:
  #   direct   - ratings shown and this issue is the one that was rated
  #   indirect - ratings shown but for a different issue
  #   control  - no ratings shown
  mutate(randomrating_house_vote_id = as.character(randomrating_house_vote_id),
         indirect = ratingsshown & issue != randomrating_house_vote_id,
         direct = ratingsshown & issue == randomrating_house_vote_id,
         control = !ratingsshown)

write.csv(dat_long, 'cleaned/heuristics_cleaned_long.csv')
